I will make these a bit prettier, but basically they just convert the existing data frames with distance/ similarity info into “distance” matrices.
#### useful functions ####
convert_similarity_to_matrix <- function(wide_data) {
#work into symmetric matrix (messy)
#convert to matrix
m <- as.matrix(wide_data)
#add extra row
m <- rbind(m,c(rep(NA,ncol(m)-1),1.0))
row.names(m)[nrow(m)] <- colnames(m)[ncol(m)]
#add extra column
m <- cbind(c(1.0,rep(NA,nrow(m)-1)),m)
colnames(m)[1] <- row.names(m)[1]
#convert into symmetric matrix (using forceSymmetric in Matrix package)
diag(m) <- 1
m <- forceSymmetric(m)
#return
m
}
convert_similarity_to_distance <- function(wide_data, col_name,reverse_dist=T) {
#convert human similarity values based on a similarity column name
#extract subset of wide human data
temp <- as.data.frame(wide_data) %>%
select(animal1,animal2, !!col_name) %>%
spread(animal2,!!col_name,fill="", convert=T)
#change animal1 column to row name
row.names(temp) <- temp$animal1
temp <- temp %>%
select(-animal1)
#convert to symmetric matrix
temp <- convert_similarity_to_matrix(temp)
if (reverse_dist) {
#convert from similarity to "distance"
temp <- 1- temp
}
#return
temp
}
LANG_ANIMAL_DISTANCE_COLOR <- here("data/processed/animal_color_distances_language_wiki.csv")
LANG_ANIMAL_DISTANCE_COLOR
## [1] "/Users/martinzettersten/Documents/GitHub/keb_2019_reanalysis/data/processed/animal_color_distances_language_wiki.csv"
LANG_ANIMAL_DISTANCE_SHAPE<- here("data/processed/animal_shape_distances_language_wiki.csv")
LANG_ANIMAL_DISTANCE_TEXTURE <- here("data/processed/animal_texture_distances_language_wiki.csv")
TIDY_HUMAN_PATH <- here("data/processed/tidy_human_data.csv")
language_data <- read_csv(LANG_ANIMAL_DISTANCE_COLOR) %>%
left_join(read_csv(LANG_ANIMAL_DISTANCE_SHAPE), by = c("animal1", "animal2")) %>%
left_join(read_csv(LANG_ANIMAL_DISTANCE_TEXTURE),by = c("animal1", "animal2")) %>%
select(-contains("PCA"))
human_data <- read_csv(TIDY_HUMAN_PATH)
human_data_wide <- human_data %>%
unite("measure", c("participant_type", "similarity_type")) %>%
spread(measure, human_similarity)
sighted_human_color_clust <- human_data_wide %>%
convert_similarity_to_distance("sighted_human_similarity_color") %>%
as.dist() %>%
hclust()
ggdendro::ggdendrogram(sighted_human_color_clust, rotate = T) +
ggtitle("Sighted Human Similarity Color")
blind_human_color_clust <- human_data_wide %>%
convert_similarity_to_distance("blind_human_similarity_color") %>%
as.dist() %>%
hclust()
ggdendro::ggdendrogram(blind_human_color_clust, rotate = T) +
ggtitle("Blind Human Similarity Color")
language_color_clust <- language_data %>%
convert_similarity_to_distance("language_similarity_simple_dist_color", reverse_dist=F) %>%
as.dist() %>%
hclust()
ggdendro::ggdendrogram(language_color_clust, rotate = T) +
ggtitle("Language Distances Color")
Lower entanglement is better (0 is perfect alignment).
dends <- dendlist(as.dendrogram(blind_human_color_clust), as.dendrogram(sighted_human_color_clust))
x <- tanglegram(dends, common_subtrees_color_branches = TRUE, highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.5"
dends <- dendlist(as.dendrogram(blind_human_color_clust), as.dendrogram(sighted_human_color_clust))
x <- dends %>%
untangle(method = "step2side") %>%
tanglegram( common_subtrees_color_branches = TRUE,highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.22"
See https://cran.r-project.org/web/packages/dendextend/vignettes/introduction.html#the-fowlkes-mallows-index-and-the-bk-plot for details.
The FM-Index is a measure of similarity between two clusterings (higher means greater similarity). The Bk plot shows the FM-INdex for various values of k, where k is the number of clusters. The dashed black line shows the expected value assuming no connection between the clusterings, the red line shows the critical significance level. Dotted points above indicate significantly similar clusterings, i.e. better than one would expect from simply reshuffling labels.
Bk_plot(as.dendrogram(blind_human_color_clust), as.dendrogram(sighted_human_color_clust), main="FM-Index for different numbers of clusters.\nDots closer to 1 indicate more similar clusterings.\nDots above red line are significant.")
Choosing a smaller cluster number and a cluster number with high similarity based on the Bk plot, so “double-dipping”.
#five clusters
print("FIVE CLUSTERS")
## [1] "FIVE CLUSTERS"
adjustedRand(cutree(blind_human_color_clust, k=5), cutree(sighted_human_color_clust, k=5))
## Rand HA MA FM Jaccard
## 0.7057471 0.1913559 0.2689056 0.3846865 0.2380952
#20 clusters
print("20 CLUSTERS")
## [1] "20 CLUSTERS"
adjustedRand(cutree(blind_human_color_clust, k=20), cutree(sighted_human_color_clust, k=20))
## Rand HA MA FM Jaccard
## 0.9586207 0.2868852 0.6573604 0.3113996 0.1818182
Lower entanglement is better (0 is perfect alignment).
dends <- dendlist(as.dendrogram(sighted_human_color_clust), as.dendrogram(language_color_clust))
x <- tanglegram(dends, common_subtrees_color_branches = TRUE, highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.64"
x <- dends %>%
untangle(method = "step2side") %>%
tanglegram( common_subtrees_color_branches = TRUE,highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.21"
Bk_plot(as.dendrogram(sighted_human_color_clust), as.dendrogram(language_color_clust), main="FM-Index for different numbers of clusters.\nDots closer to 1 indicate more similar clusterings.\nDots above red line are significant.")
Choosing a smaller cluster number and a cluster number with high similarity based on the Bk plot, so “double-dipping”.
#five clusters
print("FIVE CLUSTERS")
## [1] "FIVE CLUSTERS"
adjustedRand(cutree(sighted_human_color_clust, k=5), cutree(language_color_clust, k=5))
## Rand HA MA FM Jaccard
## 0.63218391 -0.02455397 0.07599908 0.21568627 0.12087912
#20 clusters
print("20 CLUSTERS")
## [1] "20 CLUSTERS"
adjustedRand(cutree(sighted_human_color_clust, k=20), cutree(language_color_clust, k=20))
## Rand HA MA FM Jaccard
## 0.9448276 0.1714286 0.5714286 0.2000000 0.1111111
Lower entanglement is better (0 is perfect alignment).
dends <- dendlist(as.dendrogram(blind_human_color_clust), as.dendrogram(language_color_clust))
x <- tanglegram(dends, common_subtrees_color_branches = TRUE, highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.56"
x <- dends %>%
untangle(method = "step2side") %>%
tanglegram( common_subtrees_color_branches = TRUE,highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.31"
Bk_plot(as.dendrogram(blind_human_color_clust), as.dendrogram(language_color_clust), main="FM-Index for different numbers of clusters.\nDots closer to 1 indicate more similar clusterings.\nDots above red line are significant.")
Choosing a smaller cluster number and a cluster number with high similarity based on the Bk plot, so “double-dipping”.
#five clusters
print("FIVE CLUSTERS")
## [1] "FIVE CLUSTERS"
adjustedRand(cutree(blind_human_color_clust, k=5), cutree(language_color_clust, k=5))
## Rand HA MA FM Jaccard
## 0.636781609 0.001829906 0.097555403 0.240429078 0.136612022
#20 clusters
print("21 CLUSTERS")
## [1] "21 CLUSTERS"
adjustedRand(cutree(blind_human_color_clust, k=21), cutree(language_color_clust, k=21))
## Rand HA MA FM Jaccard
## 0.9563218 0.1518728 0.6191537 0.1754116 0.0952381
sighted_human_shape_clust <- human_data_wide %>%
convert_similarity_to_distance("sighted_human_similarity_shape") %>%
as.dist() %>%
hclust()
ggdendro::ggdendrogram(sighted_human_shape_clust, rotate = T) +
ggtitle("Sighted Human Similarity shape")
blind_human_shape_clust <- human_data_wide %>%
convert_similarity_to_distance("blind_human_similarity_shape") %>%
as.dist() %>%
hclust()
ggdendro::ggdendrogram(blind_human_shape_clust, rotate = T) +
ggtitle("Blind Human Similarity shape")
language_shape_clust <- language_data %>%
convert_similarity_to_distance("language_similarity_simple_dist_shape", reverse_dist=F) %>%
as.dist() %>%
hclust()
ggdendro::ggdendrogram(language_shape_clust, rotate = T) +
ggtitle("Language Distances shape")
dends <- dendlist(as.dendrogram(blind_human_shape_clust), as.dendrogram(sighted_human_shape_clust))
x <- dends %>%
untangle(method = "step2side") %>%
tanglegram( common_subtrees_color_branches = TRUE,highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.05"
Bk_plot(as.dendrogram(blind_human_shape_clust), as.dendrogram(sighted_human_shape_clust), main="FM-Index for different numbers of clusters.\nDots closer to 1 indicate more similar clusterings.\nDots above red line are significant.")
Choosing a small cluster number (k=5).
#five clusters
print("FIVE CLUSTERS")
## [1] "FIVE CLUSTERS"
adjustedRand(cutree(blind_human_shape_clust, k=5), cutree(sighted_human_shape_clust, k=5))
## Rand HA MA FM Jaccard
## 0.7471264 0.3708914 0.4209172 0.5461264 0.3750000
dends <- dendlist(as.dendrogram(sighted_human_shape_clust), as.dendrogram(language_shape_clust))
x <- dends %>%
untangle(method = "step2side") %>%
tanglegram( common_subtrees_color_branches = TRUE,highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.15"
Bk_plot(as.dendrogram(sighted_human_shape_clust), as.dendrogram(language_shape_clust), main="FM-Index for different numbers of clusters.\nDots closer to 1 indicate more similar clusterings.\nDots above red line are significant.")
Choosing a small cluster number (k=5).
#five clusters
print("FIVE CLUSTERS")
## [1] "FIVE CLUSTERS"
adjustedRand(cutree(sighted_human_shape_clust, k=5), cutree(language_shape_clust, k=5))
## Rand HA MA FM Jaccard
## 0.6919540 0.2032967 0.2719502 0.4102403 0.2555556
dends <- dendlist(as.dendrogram(blind_human_shape_clust), as.dendrogram(language_shape_clust))
x <- dends %>%
untangle(method = "step2side") %>%
tanglegram( common_subtrees_color_branches = TRUE,highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.17"
Bk_plot(as.dendrogram(blind_human_shape_clust), as.dendrogram(language_shape_clust), main="FM-Index for different numbers of clusters.\nDots closer to 1 indicate more similar clusterings.\nDots above red line are significant.")
Choosing a small cluster number (k=5)
#five clusters
print("FIVE CLUSTERS")
## [1] "FIVE CLUSTERS"
adjustedRand(cutree(blind_human_shape_clust, k=5), cutree(language_shape_clust, k=5))
## Rand HA MA FM Jaccard
## 0.7103448 0.2205631 0.2926647 0.4123691 0.2588235
sighted_human_skin_clust <- human_data_wide %>%
convert_similarity_to_distance("sighted_human_similarity_skin") %>%
as.dist() %>%
hclust()
ggdendro::ggdendrogram(sighted_human_skin_clust, rotate = T) +
ggtitle("Sighted Human Similarity skin")
blind_human_skin_clust <- human_data_wide %>%
convert_similarity_to_distance("blind_human_similarity_skin") %>%
as.dist() %>%
hclust()
ggdendro::ggdendrogram(blind_human_skin_clust, rotate = T) +
ggtitle("Blind Human Similarity skin")
language_skin_clust <- language_data %>%
convert_similarity_to_distance("language_similarity_simple_dist_skin", reverse_dist=F) %>%
as.dist() %>%
hclust()
ggdendro::ggdendrogram(language_skin_clust, rotate = T) +
ggtitle("Language Distances skin")
dends <- dendlist(as.dendrogram(blind_human_skin_clust), as.dendrogram(sighted_human_skin_clust))
x <- dends %>%
untangle(method = "step2side") %>%
tanglegram( common_subtrees_color_branches = TRUE,highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.13"
Bk_plot(as.dendrogram(blind_human_skin_clust), as.dendrogram(sighted_human_skin_clust), main="FM-Index for different numbers of clusters.\nDots closer to 1 indicate more similar clusterings.\nDots above red line are significant.")
Choosing a small cluster number (k=5).
#five clusters
print("FIVE CLUSTERS")
## [1] "FIVE CLUSTERS"
adjustedRand(cutree(blind_human_skin_clust, k=5), cutree(sighted_human_skin_clust, k=5))
## Rand HA MA FM Jaccard
## 0.7333333 0.3609261 0.4085522 0.5489591 0.3661202
dends <- dendlist(as.dendrogram(sighted_human_skin_clust), as.dendrogram(language_skin_clust))
x <- dends %>%
untangle(method = "step2side") %>%
tanglegram( common_subtrees_color_branches = TRUE,highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.25"
Bk_plot(as.dendrogram(sighted_human_skin_clust), as.dendrogram(language_skin_clust), main="FM-Index for different numbers of clusters.\nDots closer to 1 indicate more similar clusterings.\nDots above red line are significant.")
Choosing a small cluster number (k=5).
#five clusters
print("FIVE CLUSTERS")
## [1] "FIVE CLUSTERS"
adjustedRand(cutree(sighted_human_skin_clust, k=5), cutree(language_skin_clust, k=5))
## Rand HA MA FM Jaccard
## 0.58160920 0.01367917 0.08407703 0.30439039 0.17647059
dends <- dendlist(as.dendrogram(blind_human_skin_clust), as.dendrogram(language_skin_clust))
x <- dends %>%
untangle(method = "step2side") %>%
tanglegram( common_subtrees_color_branches = TRUE,highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.22"
Bk_plot(as.dendrogram(blind_human_skin_clust), as.dendrogram(language_skin_clust), main="FM-Index for different numbers of clusters.\nDots closer to 1 indicate more similar clusterings.\nDots above red line are significant.")
Choosing a small cluster number (k=5)
#five clusters
print("FIVE CLUSTERS")
## [1] "FIVE CLUSTERS"
adjustedRand(cutree(blind_human_skin_clust, k=5), cutree(language_skin_clust, k=5))
## Rand HA MA FM Jaccard
## 0.66896552 0.08477251 0.17349047 0.30132612 0.17714286